package com.airlocksoftware.hackernews.parser;

import android.content.Context;
import android.util.Log;

import com.airlocksoftware.hackernews.data.ConnectionManager;
import com.airlocksoftware.hackernews.data.UserPrefs;
import com.airlocksoftware.hackernews.model.*;

import org.jsoup.Connection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that download a story's comments page or a user's "Threads" page and scrape it
 * into model objects using jsoup CSS selectors.
 *
 * <p>Both public entry points catch every exception and report it through the {@code result} field
 * of the returned response object instead of throwing.
 */
public class CommentsParser {

    /** Value stored in {@code primaryId} of timestamps created for a Threads page. */
    public static final String THREAD_TIMESTAMP_ID = "Thread";

    /** Value stored in {@code primaryId} of timestamps created for a comments page. */
    public static final String COMMENT_TIMESTAMP_ID = "Comment";

    private static final String TAG = CommentsParser.class.getSimpleName();

    /** Matches the run of digits that directly follows {@code id=} in a link. */
    private static final Pattern ID_PATTERN = Pattern.compile("(?<=id=)[0-9]+");

    /** Position of the auth value after a vote link's href has been split on {@code =} and {@code &}. */
    private static final int VOTE_AUTH_INDEX = 7;

    /** Width of the spacer image that corresponds to one level of comment nesting. */
    private static final int DEPTH_INDENT_WIDTH = 40;

    /** Object used to encapsulate the results of parsing a comments page. */
    public static class CommentsResponse {

        /** The story the comments belong to. */
        public Story story;

        /** The parsed comments; {@link #parseCommentsPage} sets this to an empty list on failure. */
        public List<Comment> comments;

        /** Timestamp for this fetch; also carries the reply form values when the page has one. */
        public CommentsTimestamp timestamp;

        /** Outcome of the parse. */
        public Result result;

        public CommentsResponse() {
            // empty constructor
        }

        public CommentsResponse(Result result) {
            this.result = result;
        }
    }

    /** Object that encapsulates the results of parsing a user's Threads page. */
    public static class ThreadsResponse {

        /** Timestamp for this fetch, or null when the page had no {@code td.title a} link. */
        public StoryTimestamp timestamp;

        /** The parsed threads; left unset when parsing fails. */
        public List<CommentThread> threads;

        /** Outcome of the parse. */
        public Result result;
    }

    /**
     * Parses comment groups from the first page of a user's "Threads" page.
     *
     * @param context used to read the stored user preferences
     * @param username the user whose threads are requested
     * @return the parsed threads together with a {@link Result}
     */
    public static ThreadsResponse parseThreadsPage(Context context, String username) {
        return parseThreadsPage(context, username, null);
    }

    /**
     * Parses comment groups from a user's "Threads" page.
     *
     * @param context used to read the stored user preferences
     * @param username the user whose threads are requested
     * @param moreFnid link to a follow-up page, or null to load the user's first page
     * @return a response whose result is {@code MORE} when {@code moreFnid} was given,
     *         {@code SUCCESS} for a first page, and {@code FAILURE} if anything threw
     */
    public static ThreadsResponse parseThreadsPage(Context context, String username, String moreFnid) {
        ThreadsResponse response = new ThreadsResponse();
        try {
            UserPrefs data = new UserPrefs(context);
            Document doc = getThreadsDocument(data, moreFnid, username);
            response.threads = parseCommentsThreads(doc, data.isLoggedIn());
            response.timestamp = getNewThreadsTimestamp(doc, username);
            response.result = moreFnid != null ? Result.MORE : Result.SUCCESS;
        } catch (Exception e) {
            response.result = Result.FAILURE;
            Log.e(TAG, "Error parsing threads", e);
        }
        return response;
    }

    /**
     * Parses a list of CommentThreads from the given document.
     *
     * <p>Every {@code td.default} cell is one comment. A cell that {@link #parseThread} recognises
     * as a thread header opens a new thread; the comment itself is appended to the most recently
     * opened thread.
     */
    private static List<CommentThread> parseCommentsThreads(Document doc, boolean isLoggedIn) {
        List<CommentThread> threads = new ArrayList<CommentThread>();
        CommentThread currentThread = null;

        for (Element commentRow : doc.select("td.default")) {
            CommentThread newThread = parseThread(commentRow);
            if (newThread != null) {
                threads.add(newThread);
                currentThread = newThread;
            }

            if (currentThread == null) {
                // No thread has been opened yet, so there is nothing to attach this comment to.
                Log.w(TAG, "Skipping comment that appears before any thread header");
                continue;
            }
            currentThread.comments.add(parseComment(commentRow, isLoggedIn));
        }
        return threads;
    }

    /**
     * Checks whether a comment element contains the start of a new thread of comments (i.e. a link
     * to a story). If so, it parses and returns it. Else it returns null.
     *
     * @param comment the {@code td.default} cell holding the comment
     * @return a new thread with an empty comment list and a story carrying the linked title and id,
     *         or null when the cell has no {@code span.comhead} or the header has fewer than four
     *         child elements
     */
    private static CommentThread parseThread(Element comment) {
        Element comhead = comment.select("span.comhead").first();
        if (comhead == null) {
            // Nothing to inspect; such a row cannot be a thread header.
            return null;
        }

        // size doesn't include text nodes
        boolean isNewThread = comhead.children().size() >= 4;
        if (!isNewThread) {
            return null;
        }

        CommentThread thread = new CommentThread();
        thread.comments = new ArrayList<Comment>();
        thread.story = new Story();

        // The last link in the header is used as the story link.
        Element titleLink = comhead.select("a").last();
        thread.story.title = titleLink.text();

        // get id
        Matcher m = ID_PATTERN.matcher(titleLink.attr("href"));
        if (m.find()) {
            thread.story.storyId = Long.parseLong(m.group());
        }
        return thread;
    }

    /**
     * Parses a story and its comments from the comments page identified by storyId.
     *
     * @param context used to read the stored user preferences
     * @param storyId id of the story whose page is fetched
     * @return the parsed story, comments and timestamp with {@code SUCCESS}; on any exception the
     *         result is {@code FAILURE} and {@code comments} is an empty list
     */
    public static CommentsResponse parseCommentsPage(Context context, long storyId) {
        CommentsResponse response = new CommentsResponse();
        try {
            UserPrefs data = new UserPrefs(context);
            Document doc = getCommentsDocument(data, storyId);

            // parse story
            Elements storyRows = getStoryRows(doc);
            Element line1 = storyRows.first();
            Element line2 = doc.select("td.subtext").first();
            response.story = StoryParser.parseStory(line1, line2, data.isLoggedIn());
            response.story.storyId = storyId;

            response.timestamp = getNewCommentsTimestamp(storyId);

            // setup reply form values, isArchived, & selfText
            Element replyInput = storyRows.select("form[action=comment]").first();
            if (replyInput != null) {
                response.timestamp.parent = replyInput.select("input[name=parent]").first().attr("value");
                response.timestamp.go_to = replyInput.select("input[name=goto]").first().attr("value");
                response.timestamp.hmac = replyInput.select("input[name=hmac]").first().attr("value");
                response.story.selfText = getSelfText(storyRows);
                response.story.isArchived = false;
            } else {
                // no reply form: it's an archived story
                response.story.isArchived = true;
                response.story.selfText = getArchivedSelfText(storyRows);
            }

            response.comments = parseComments(doc, storyId, data.isLoggedIn());
            response.result = Result.SUCCESS;
        } catch (Exception e) {
            response.result = Result.FAILURE;
            response.comments = new ArrayList<Comment>();
            Log.e(TAG, "Error parsing comments", e);
        }
        return response;
    }

    /**
     * Parses a list of comments from the given document, one per {@code td.default} cell, and
     * stamps each with {@code storyId}.
     */
    private static List<Comment> parseComments(Document doc, long storyId, boolean isLoggedIn) {
        List<Comment> comments = new ArrayList<Comment>();
        for (Element commentContainer : doc.select("td.default")) {
            Comment comment = parseComment(commentContainer, isLoggedIn);
            comment.storyId = storyId;
            comments.add(comment);
        }
        return comments;
    }

    /**
     * Parses a comment from the Element containing it.
     *
     * <p>A comment whose extracted html is empty is treated as deleted: its username, ago and html
     * are all set to {@code "deleted"} and the remaining fields keep their defaults.
     *
     * @param commentContainer the {@code td.default} cell holding the comment
     * @param isLoggedIn whether to extract the vote credentials ({@code whence} / {@code auth})
     * @return the populated comment
     * @throws IllegalStateException if no comment id can be read from the comment header
     */
    private static Comment parseComment(Element commentContainer, boolean isLoggedIn) {
        Comment comment = new Comment();
        comment.depth = getDepth(commentContainer);
        comment.html = getHtml(commentContainer);

        // setup default values
        comment.username = "";
        comment.ago = "";
        comment.commentId = -1;
        comment.auth = "";
        comment.whence = "";
        comment.replyUrl = "";
        comment.isUpvoted = false;

        // if it's a deleted comment, return early
        if (comment.html.equals("")) {
            comment.username = "deleted";
            comment.ago = "deleted";
            comment.html = "deleted";
            return comment;
        }

        Element comhead = getComhead(commentContainer);
        comment.replyUrl = getReplyUrl(commentContainer);
        comment.username = getUsername(comhead);
        comment.ago = getAgo(comhead);
        comment.commentId = getCommentId(comhead);

        // A row without a vote link is reported as already upvoted.
        Element voteAnchor = commentContainer.parent().select("a[href^=vote]").first();
        comment.isUpvoted = voteAnchor == null;

        if (isLoggedIn && !comment.isUpvoted) {
            String[] voteHref = voteAnchor.attr("href").split("[=&]");
            // whence is the final value in the link
            comment.whence = voteHref[voteHref.length - 1];
            // Leave auth at its default rather than failing the whole page on a shorter link.
            if (voteHref.length > VOTE_AUTH_INDEX) {
                comment.auth = voteHref[VOTE_AUTH_INDEX];
            }
        }
        return comment;
    }

    /**
     * Reads the comment id out of the header's {@code item} link.
     *
     * @throws IllegalStateException if the link is missing or has no {@code id=} digits
     */
    private static long getCommentId(Element comhead) {
        String linkHref = comhead.select("a[href^=item]").attr("href");
        Matcher matcher = ID_PATTERN.matcher(linkHref);
        if (!matcher.find()) {
            throw new IllegalStateException("Couldn't parse comment id from commentHeader");
        }
        return Long.parseLong(matcher.group());
    }

    /** Returns the text of the header's second link with any {@code |} removed and trimmed. */
    private static String getAgo(Element comhead) {
        Element agoLink = comhead.select("a").get(1);
        return agoLink.text().replace("|", "").trim();
    }

    /** Returns the text of the header's first link. */
    private static String getUsername(Element comhead) {
        return comhead.select("a").first().text();
    }

    /** Returns the first {@code span.comhead} inside the container, or null if there is none. */
    private static Element getComhead(Element commentContainer) {
        return commentContainer.select("span.comhead").first();
    }

    /** Returns the href of the comment's "reply" link, or an empty string if there is none. */
    private static String getReplyUrl(Element commentContainer) {
        return commentContainer.select("span.comment a:containsOwn(reply)").attr("href");
    }

    /**
     * Returns the comment body as html: the direct children of {@code div.comment}, excluding any
     * paragraph that contains a sized {@code font} element, with all font tags stripped.
     */
    private static String getHtml(Element commentContainer) {
        Elements comment = commentContainer.select("div.comment > :not(p:has(font[size]))");
        String html = comment.outerHtml();
        // delete font tags from Html
        return html.replaceAll("[<](/)?font[^>]*[>]", "");
    }

    /**
     * Derives the nesting depth from the width of the row's {@code s.gif} spacer image.
     *
     * <p>The selector used to be {@code img[src^=http://ycombinator.com/images/]}; it was changed in
     * response to a change in the HTML from news.ycombinator.com.
     */
    private static int getDepth(Element commentContainer) {
        Element spacerImg = commentContainer.parent().select("img[src^=s.gif]").first();
        return Integer.parseInt(spacerImg.attr("width")) / DEPTH_INDENT_WIDTH;
    }

    /**
     * Returns the self text of an archived story, read from the last child of the fourth story row,
     * or null when there are not enough rows for the story to have self text.
     */
    private static String getArchivedSelfText(Elements storyRows) {
        // check if it has selfText; row index 3 is read below, so at least four rows are required
        if (storyRows.size() <= 3) {
            return null;
        }
        return storyRows.get(3).children().last().text();
    }

    /**
     * Returns the self text of a story that still accepts replies, read from the last child of the
     * third story row, or null when there are four or fewer rows.
     */
    private static String getSelfText(Elements storyRows) {
        // check if it has selfText
        if (storyRows.size() <= 4) {
            return null;
        }
        // TODO switched this to 2 from 3 because it stopped working... need to figure out a more
        // resilient way to do this
        return storyRows.get(2).children().last().text();
    }

    /** Returns the sibling rows of the row that contains the first {@code td.subtext} cell. */
    private static Elements getStoryRows(Document doc) {
        return doc.select("td.subtext").first().parent().siblingElements();
    }

    /** Creates a timestamp for a comments page, keyed by the story id and stamped with the current time. */
    private static CommentsTimestamp getNewCommentsTimestamp(long storyId) {
        CommentsTimestamp timestamp = new CommentsTimestamp();
        timestamp.time = System.currentTimeMillis();
        timestamp.primaryId = COMMENT_TIMESTAMP_ID;
        timestamp.secondaryId = Long.toString(storyId);
        return timestamp;
    }

    /**
     * GETs the comments page for the given story, using the user's cookie when logged in.
     *
     * @throws IOException if the request fails
     */
    private static Document getCommentsDocument(UserPrefs data, long storyId) throws IOException {
        String url = ConnectionManager.ITEMS_URL + Long.toString(storyId);
        Connection con = data.isLoggedIn()
                ? ConnectionManager.authConnect(url, data.getUserCookie())
                : ConnectionManager.anonConnect(url);
        return con.get();
    }

    /**
     * Creates a timestamp for a Threads page, keyed by the username, whose {@code fnid} is the href
     * of the first {@code td.title a} link in the document.
     *
     * @return the timestamp, or null when the document has no such link
     */
    private static StoryTimestamp getNewThreadsTimestamp(Document doc, String username) {
        Element more = doc.select("td.title a").first();
        if (more == null) {
            return null;
        }

        StoryTimestamp timestamp = new StoryTimestamp();
        timestamp.fnid = more.attr("href");
        timestamp.time = System.currentTimeMillis();
        timestamp.primaryId = THREAD_TIMESTAMP_ID;
        timestamp.secondaryId = username;
        return timestamp;
    }

    /**
     * GETs the document specified by the parameters: {@code moreFnid} when it is given, otherwise
     * the user's Threads page. The user's cookie is sent when logged in.
     *
     * @throws IOException if the request fails
     */
    private static Document getThreadsDocument(UserPrefs data, String moreFnid, String username)
            throws IOException {
        String url = moreFnid != null ? moreFnid : ConnectionManager.THREADS_URL + username;
        Connection con = data.isLoggedIn()
                ? ConnectionManager.authConnect(url, data.getUserCookie())
                : ConnectionManager.anonConnect(url);
        return con.get();
    }
}